1 package org.apache.lucene.analysis.ja.dict;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.BufferedReader;
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.Comparator;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.TreeMap;
29
30 import org.apache.lucene.analysis.ja.util.CSVUtil;
31 import org.apache.lucene.util.IntsRef;
32 import org.apache.lucene.util.IntsRefBuilder;
33 import org.apache.lucene.util.fst.Builder;
34 import org.apache.lucene.util.fst.FST;
35 import org.apache.lucene.util.fst.PositiveIntOutputs;
36
37
38
39
40
41 public final class UserDictionary implements Dictionary {
42
43
44 private final TokenInfoFST fst;
45
46
47 private final int segmentations[][];
48
49
50 private final String data[];
51
52 private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
53
54 public static final int WORD_COST = -100000;
55
56 public static final int LEFT_ID = 5;
57
58 public static final int RIGHT_ID = 5;
59
60 public static UserDictionary open(Reader reader) throws IOException {
61
62 BufferedReader br = new BufferedReader(reader);
63 String line = null;
64 List<String[]> featureEntries = new ArrayList<>();
65
66
67 while ((line = br.readLine()) != null) {
68
69 line = line.replaceAll("#.*$", "");
70
71
72 if (line.trim().length() == 0) {
73 continue;
74 }
75 String[] values = CSVUtil.parse(line);
76 featureEntries.add(values);
77 }
78
79 if (featureEntries.isEmpty()) {
80 return null;
81 } else {
82 return new UserDictionary(featureEntries);
83 }
84 }
85
86 private UserDictionary(List<String[]> featureEntries) throws IOException {
87
88 int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
89
90
91
92 Collections.sort(featureEntries, new Comparator<String[]>() {
93 @Override
94 public int compare(String[] left, String[] right) {
95 return left[0].compareTo(right[0]);
96 }
97 });
98
99 List<String> data = new ArrayList<>(featureEntries.size());
100 List<int[]> segmentations = new ArrayList<>(featureEntries.size());
101
102 PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
103 Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
104 IntsRefBuilder scratch = new IntsRefBuilder();
105 long ord = 0;
106
107 for (String[] values : featureEntries) {
108 String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
109 String[] readings = values[2].replaceAll(" *", " ").split(" ");
110 String pos = values[3];
111
112 if (segmentation.length != readings.length) {
113 throw new RuntimeException("Illegal user dictionary entry " + values[0] +
114 " - the number of segmentations (" + segmentation.length + ")" +
115 " does not the match number of readings (" + readings.length + ")");
116 }
117
118 int[] wordIdAndLength = new int[segmentation.length + 1];
119 wordIdAndLength[0] = wordId;
120 for (int i = 0; i < segmentation.length; i++) {
121 wordIdAndLength[i + 1] = segmentation[i].length();
122 data.add(readings[i] + INTERNAL_SEPARATOR + pos);
123 wordId++;
124 }
125
126 String token = values[0];
127 scratch.grow(token.length());
128 scratch.setLength(token.length());
129 for (int i = 0; i < token.length(); i++) {
130 scratch.setIntAt(i, (int) token.charAt(i));
131 }
132 fstBuilder.add(scratch.get(), ord);
133 segmentations.add(wordIdAndLength);
134 ord++;
135 }
136 this.fst = new TokenInfoFST(fstBuilder.finish(), false);
137 this.data = data.toArray(new String[data.size()]);
138 this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
139 }
140
141
142
143
144
145
146
147
148 public int[][] lookup(char[] chars, int off, int len) throws IOException {
149
150 TreeMap<Integer, int[]> result = new TreeMap<>();
151 boolean found = false;
152
153 final FST.BytesReader fstReader = fst.getBytesReader();
154
155 FST.Arc<Long> arc = new FST.Arc<>();
156 int end = off + len;
157 for (int startOffset = off; startOffset < end; startOffset++) {
158 arc = fst.getFirstArc(arc);
159 int output = 0;
160 int remaining = end - startOffset;
161 for (int i = 0; i < remaining; i++) {
162 int ch = chars[startOffset+i];
163 if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
164 break;
165 }
166 output += arc.output.intValue();
167 if (arc.isFinal()) {
168 final int finalOutput = output + arc.nextFinalOutput.intValue();
169 result.put(startOffset-off, segmentations[finalOutput]);
170 found = true;
171 }
172 }
173 }
174
175 return found ? toIndexArray(result) : EMPTY_RESULT;
176 }
177
178 public TokenInfoFST getFST() {
179 return fst;
180 }
181
182 private static final int[][] EMPTY_RESULT = new int[0][];
183
184
185
186
187
188 private int[][] toIndexArray(Map<Integer, int[]> input) {
189 ArrayList<int[]> result = new ArrayList<>();
190 for (int i : input.keySet()) {
191 int[] wordIdAndLength = input.get(i);
192 int wordId = wordIdAndLength[0];
193
194 int current = i;
195 for (int j = 1; j < wordIdAndLength.length; j++) {
196 int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
197 result.add(token);
198 current += wordIdAndLength[j];
199 }
200 }
201 return result.toArray(new int[result.size()][]);
202 }
203
204 public int[] lookupSegmentation(int phraseID) {
205 return segmentations[phraseID];
206 }
207
208 @Override
209 public int getLeftId(int wordId) {
210 return LEFT_ID;
211 }
212
213 @Override
214 public int getRightId(int wordId) {
215 return RIGHT_ID;
216 }
217
218 @Override
219 public int getWordCost(int wordId) {
220 return WORD_COST;
221 }
222
223 @Override
224 public String getReading(int wordId, char surface[], int off, int len) {
225 return getFeature(wordId, 0);
226 }
227
228 @Override
229 public String getPartOfSpeech(int wordId) {
230 return getFeature(wordId, 1);
231 }
232
233 @Override
234 public String getBaseForm(int wordId, char surface[], int off, int len) {
235 return null;
236 }
237
238 @Override
239 public String getPronunciation(int wordId, char surface[], int off, int len) {
240 return null;
241 }
242
243 @Override
244 public String getInflectionType(int wordId) {
245 return null;
246 }
247
248 @Override
249 public String getInflectionForm(int wordId) {
250 return null;
251 }
252
253 private String[] getAllFeaturesArray(int wordId) {
254 String allFeatures = data[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
255 if(allFeatures == null) {
256 return null;
257 }
258
259 return allFeatures.split(INTERNAL_SEPARATOR);
260 }
261
262
263 private String getFeature(int wordId, int... fields) {
264 String[] allFeatures = getAllFeaturesArray(wordId);
265 if (allFeatures == null) {
266 return null;
267 }
268 StringBuilder sb = new StringBuilder();
269 if (fields.length == 0) {
270 for (String feature : allFeatures) {
271 sb.append(CSVUtil.quoteEscape(feature)).append(",");
272 }
273 } else if (fields.length == 1) {
274 sb.append(allFeatures[fields[0]]).append(",");
275 } else {
276 for (int field : fields){
277 sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
278 }
279 }
280 return sb.deleteCharAt(sb.length() - 1).toString();
281 }
282
283 }